[Kaggle] Urinary biomarkers for pancreatic cancer

Author

SEOYEON CHOI

Published

September 30, 2025

data

link

Column name Original column name Details
sample_id Sample ID Unique string identifying each subject
patient_cohort Patient’s Cohort Cohort 1: previously used samples; Cohort 2: newly added samples
sample_origin Sample Origin BPTB: Barts Pancreas Tissue Bank, London, UK; ESP: Spanish National Cancer Research Centre, Madrid, Spain; LIV: Liverpool University, UK; UCL: University College London, UK
age Age Age in years
sex Sex M = male, F = female
diagnosis Diagnosis (1=Control, 2=Benign, 3=PDAC) 1 = control (no pancreatic disease), 2 = benign hepatobiliary disease (119 of which are chronic pancreatitis), 3 = Pancreatic ductal adenocarcinoma (pancreatic cancer)
stage Stage For those with pancreatic cancer, one of IA, IB, IIA, IIB, III, IV
benign_sample_diagnosis Benign Samples Diagnosis For those with a benign, non-cancerous diagnosis, what was the diagnosis?
plasma_CA19_9 Plasma CA19-9 U/ml Blood plasma levels of CA 19–9, often elevated in pancreatic cancer. Only assessed in 350 patients.
creatinine Creatinine mg/ml Urinary biomarker of kidney function
LYVE1 LYVE1 ng/ml Urinary levels of Lymphatic vessel endothelial hyaluronan receptor 1, may play a role in tumor metastasis
REG1B REG1B ng/ml Urinary levels of a protein that may be associated with pancreas regeneration
TFF1 TFF1 ng/ml Urinary levels of Trefoil Factor 1, related to regeneration and repair of urinary tract
REG1A REG1A ng/ml Urinary levels of a protein associated with pancreas regeneration. Only assessed in 306 patients.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# 통계 검정
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency
# ml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# 모델 후보
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# pytorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

Data

# Load the Debernardi et al. (2020) urinary biomarker dataset (590 samples,
# 14 columns; see the column glossary above).
# NOTE(review): the relative path climbs outside the project tree and points
# at a folder named "delete" — confirm it resolves from the notebook's CWD.
df = pd.read_csv('../../../delete/Debernardi et al 2020 data.csv')
df
sample_id patient_cohort sample_origin age sex diagnosis stage benign_sample_diagnosis plasma_CA19_9 creatinine LYVE1 REG1B TFF1 REG1A
0 S1 Cohort1 BPTB 33 F 1 NaN NaN 11.7 1.83222 0.893219 52.948840 654.282174 1262.000
1 S10 Cohort1 BPTB 81 F 1 NaN NaN NaN 0.97266 2.037585 94.467030 209.488250 228.407
2 S100 Cohort2 BPTB 51 M 1 NaN NaN 7.0 0.78039 0.145589 102.366000 461.141000 NaN
3 S101 Cohort2 BPTB 61 M 1 NaN NaN 8.0 0.70122 0.002805 60.579000 142.950000 NaN
4 S102 Cohort2 BPTB 62 M 1 NaN NaN 9.0 0.21489 0.000860 65.540000 41.088000 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
585 S549 Cohort2 BPTB 68 M 3 IV NaN NaN 0.52026 7.058209 156.241000 525.178000 NaN
586 S558 Cohort2 BPTB 71 F 3 IV NaN NaN 0.85956 8.341207 16.915000 245.947000 NaN
587 S560 Cohort2 BPTB 63 M 3 IV NaN NaN 1.36851 7.674707 289.701000 537.286000 NaN
588 S583 Cohort2 BPTB 75 F 3 IV NaN NaN 1.33458 8.206777 205.930000 722.523000 NaN
589 S590 Cohort1 BPTB 74 M 3 IV NaN 1488.0 1.50423 8.200958 411.938275 2021.321078 13200.000

590 rows × 14 columns

# Summary statistics for the numeric columns. Per the counts below,
# plasma_CA19_9 (n=350) and REG1A (n=306) have substantial missingness.
df.describe()
age diagnosis plasma_CA19_9 creatinine LYVE1 REG1B TFF1 REG1A
count 590.000000 590.000000 350.000000 590.000000 590.000000 590.000000 590.000000 306.000000
mean 59.079661 2.027119 654.002944 0.855383 3.063530 111.774090 597.868722 735.281222
std 13.109520 0.804873 2430.317642 0.639028 3.438796 196.267110 1010.477245 1477.247724
min 26.000000 1.000000 0.000000 0.056550 0.000129 0.001104 0.005293 0.000000
25% 50.000000 1.000000 8.000000 0.373230 0.167179 10.757216 43.961000 80.692000
50% 60.000000 2.000000 26.500000 0.723840 1.649862 34.303353 259.873974 208.538500
75% 69.000000 3.000000 294.000000 1.139482 5.205037 122.741013 742.736000 649.000000
max 89.000000 3.000000 31000.000000 4.116840 23.890323 1403.897600 13344.300000 13200.000000
# Frequency table for every non-numeric column, keeping NaN as a category
# so missing stages / benign diagnoses are visible.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
for column in categorical_cols:
    print(f"\n📊 {column}")
    print(df[column].value_counts(dropna=False))

📊 sample_id
S1      1
S588    1
S302    1
S288    1
S497    1
       ..
S282    1
S321    1
S323    1
S363    1
S590    1
Name: sample_id, Length: 590, dtype: int64

📊 patient_cohort
Cohort1    332
Cohort2    258
Name: patient_cohort, dtype: int64

📊 sample_origin
BPTB    409
LIV     132
ESP      29
UCL      20
Name: sample_origin, dtype: int64

📊 sex
F    299
M    291
Name: sex, dtype: int64

📊 stage
NaN    391
III     76
IIB     68
IV      21
IB      12
IIA     11
II       7
IA       3
I        1
Name: stage, dtype: int64

📊 benign_sample_diagnosis
NaN                                                             382
Pancreatitis                                                     41
Pancreatitis (Chronic)                                           35
Gallstones                                                       21
Pancreatitis (Alcohol-Chronic)                                   11
Cholecystitis                                                     9
Serous cystadenoma - NOS                                          7
Abdominal Pain                                                    6
Choledocholiathiasis                                              6
Pancreatitis (Alcohol-Chronic-Pseuodcyst)                         4
Pancreatitis (Gallstone)                                          4
Pancreatitis (Pseudocyst)                                         4
Pancreatitis (Idiopathic)                                         4
Pancreatitis (Autoimmune)                                         3
Serous microcystic adenoma                                        3
Gallstones - Incidental                                           3
Cholecystitis (Chronic)                                           3
Premalignant lesions-Mucinous cystadenoma-NOS                     3
Gallstones                                                        2
Gallbladder polyps                                                2
Premalignant lesions-Adenoma-NOS                                  2
Pancreatitis (Chronic-Pseudocyst)                                 2
Cholecystitis (Chronic) Cholelithiasis                            2
Premalignant lesions-Villous adenoma-NOS                          2
Pancreatitis (Gallstone-Pseudocyst)                               1
Pancreatitis (Gallstone-Alcohol-Pseudocyst)                       1
Pancreato-jejunostomy Anastomoses Stricture                       1
Premalignant lesions-Tubulovillous adenoma-NOS                    1
Pancreatitis (Chronic) Choledocholithiasis                        1
Premalignant lesions-Mucinous cystadenocarcinoma-noninvasive      1
Pancreatitis (Hereditary-Chronic)                                 1
Pancreatitis (Hypertriglyceridemia)                               1
Pancreatitis (Idiopathic)                                         1
Premalignant lesions-Tubular adenoma-NOS                          1
Pancreatitis (Abscess)                                            1
Pancreatitis (Chronic) (Later became PDAC)                        1
Pancreatitis (Alcohol)                                            1
Biliary Stricture (Secondary to Stent)                            1
Cholecystitis                                                     1
Cholecystitis (Chronic) Cholesterolsis                            1
Choledochal Cyst                                                  1
Choledocholiathiasis                                              1
Cholelithiasis with adenomyomatous hyperplasia                    1
Duodenal Stricture                                                1
Duodenitis                                                        1
Gallbladder Porcelain                                             1
Gastritis                                                         1
Gastritis and Reflux                                              1
Ill defined lesion in uncinate process                            1
Ischaemic Common Bile Duct Stricture                              1
Pancreatitis                                                      1
Pancreatitis (Acute)                                              1
Simple benign liver cyst                                          1
Name: benign_sample_diagnosis, dtype: int64
  • crosstab between sex
# Contingency table: diagnosis counts within each sex.
# Fix: in the exported cell order, df['diagnosis_label'] is only created
# further down (the boxplot cell), so running this notebook top-to-bottom
# raised a KeyError here. Derive the label column first if it is missing.
if 'diagnosis_label' not in df.columns:
    df['diagnosis_label'] = df['diagnosis'].map({1:'Control', 2:'Benign', 3:'PDAC'})
ct = pd.crosstab(df['sex'], df['diagnosis_label'])
print(ct)
diagnosis_label  Benign  Control  PDAC
sex                                   
F                   101      115    83
M                   107       68   116
  • Within-sex Proportions
# Row-normalised view: the diagnosis mix within each sex, as percentages.
print("\nWithin-sex proportions (%):")
row_pct = ct.div(ct.sum(axis=1), axis=0).mul(100).round(2)
print(row_pct)

Within-sex proportions (%):
diagnosis_label  Benign  Control   PDAC
sex                                    
F                 33.78    38.46  27.76
M                 36.77    23.37  39.86
  • 남성이 많이 췌장암 발생하는 경향

  • 성별 다른지 chi-square test

# Chi-square test of independence on the 2x3 sex-by-diagnosis table;
# a small p-value means the diagnosis distribution differs between sexes.
chi2, p, dof, expected = chi2_contingency(ct)
print(f"Chi-square test p-value = {p:.4f}")
Chi-square test p-value = 0.0001

다름

  • visualization
# Stacked bar chart of the within-sex diagnosis proportions.
sex_props = ct.div(ct.sum(axis=1), axis=0)
ax = sex_props.plot(kind='bar', stacked=True, figsize=(6, 4))
ax.set_title('Diagnosis distribution by Sex')
ax.set_ylabel('Proportion')
ax.legend(title='Diagnosis')
plt.tight_layout()
plt.show()

시각화해보면 딱 보임

diagnosis level visualization by Biomarker

# Map numeric diagnosis codes to readable labels, then draw one box plot
# per biomarker grouped by diagnosis (2x3 grid of subplots).
df['diagnosis_label'] = df['diagnosis'].map({1:'Control', 2:'Benign', 3:'PDAC'})
biomarkers = ['LYVE1', 'REG1B', 'TFF1', 'REG1A', 'plasma_CA19_9', 'creatinine']
plt.figure(figsize=(14, 8))
for i, biomarker in enumerate(biomarkers, 1):
    plt.subplot(2, 3, i)
    sns.boxplot(data=df, x='diagnosis_label', y=biomarker, palette='Set2')
    plt.title(biomarker)
# Fix: tight_layout only needs to run once after all subplots exist, not on
# every loop iteration (the original re-laid-out the figure six times).
plt.tight_layout()
plt.show()

  • 췌장암 환자 PDAC
    • LYVE1이 높음. REG1B가 높음 TFF1이 높음, REG1A가 높음

Spearman Correlation

# Pairwise Spearman rank correlations between the biomarkers — rank-based,
# so robust to the heavy right skew visible in the box plots.
corr_mat = df[biomarkers].corr(method="spearman", min_periods=20)
plt.figure(figsize=(8,7))
plt.imshow(corr_mat, aspect="auto", interpolation="nearest")
tick_positions = range(len(biomarkers))
plt.xticks(tick_positions, biomarkers, rotation=45, ha="right")
plt.yticks(tick_positions, biomarkers)
plt.title("Spearman correlation (biomarkers)")
plt.colorbar()
plt.tight_layout()
plt.show()

Statistical Test

# One-way ANOVA per biomarker: does its mean differ across the three
# diagnosis groups? (Group order does not affect the F statistic.)
for biomarker in biomarkers:
    grouped = df.groupby('diagnosis_label')[biomarker]
    samples = [values.dropna() for _, values in grouped]
    stat, p = f_oneway(*samples)
    print(f"{biomarker}: p-value = {p:.4f}")
LYVE1: p-value = 0.0000
REG1B: p-value = 0.0000
TFF1: p-value = 0.0000
REG1A: p-value = 0.0000
plasma_CA19_9: p-value = 0.0000
creatinine: p-value = 0.1894
  • diagnosis level 별로 그룹 간 차이가 있는지
    • creatinine 빼고 다 차이 있음

Random Forest

  • target, features
# Features: the six biomarkers; target: the raw 1/2/3 diagnosis code.
# NOTE(review): fillna(0) treats "not measured" (plasma_CA19_9, REG1A) as a
# zero concentration — consider median imputation or a missing indicator.
X = df[biomarkers].fillna(0)
y = df['diagnosis']
# NOTE(review): unlike the LR/SVM cells below, this split is not stratified.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  • randomforest
# Random forest with default hyperparameters; fixed seed for reproducibility.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
  • prediction
# Hold-out performance: per-class precision / recall / F1 on the test split.
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           1       0.76      0.76      0.76        41
           2       0.61      0.59      0.60        39
           3       0.85      0.87      0.86        38

    accuracy                           0.74       118
   macro avg       0.74      0.74      0.74       118
weighted avg       0.74      0.74      0.74       118

Confusion Matrix

# Confusion matrix for the random forest, annotated with raw counts.
class_names = ["1-Control", "2-Benign", "3-PDAC"]
cm = confusion_matrix(y_test, y_pred, labels=[1,2,3])
plt.figure(figsize=(7,6))
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion Matrix (RF)")
plt.xticks([0,1,2], class_names, rotation=20)
plt.yticks([0,1,2], class_names)
for row in range(3):
    for col in range(3):
        plt.text(col, row, cm[row, col], ha="center", va="center")
plt.colorbar()
plt.tight_layout()
plt.show()

Check Feature Importance

# Impurity-based feature importances from the fitted forest, largest first.
importances = pd.Series(rf.feature_importances_, index=biomarkers).sort_values(ascending=False)
importances
plasma_CA19_9    0.241888
LYVE1            0.216604
TFF1             0.181022
REG1B            0.146198
creatinine       0.123882
REG1A            0.090405
dtype: float64
# Horizontal bar chart of the ranked feature importances.
ax = sns.barplot(x=importances, y=importances.index)
ax.set_title('Feature Importance')
plt.show()

Logistic Regression

# Wider feature set: age + sex + all six biomarkers.
feature_cols = ["age", "creatinine", "LYVE1", "REG1B", "TFF1", "REG1A", "plasma_CA19_9"]
X = df[feature_cols].copy()
X["sex"] = df["sex"].map({"M":0, "F":1})
y = df["diagnosis"]
# NOTE(review): medians are computed on the FULL dataset before the split,
# leaking test-set information into training — prefer SimpleImputer (already
# imported above) inside the Pipeline so it is fit on training data only.
X = X.fillna(X.median())
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
# Scaling lives inside the pipeline, so it is fit on the training split only.
# NOTE(review): multi_class='ovr' is deprecated in recent scikit-learn —
# confirm the installed version before relying on it.
log_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=200, multi_class='ovr'))
])
log_reg.fit(X_train, y_train)
print("\n--- Logistic Regression ---")
print(classification_report(y_test, log_reg.predict(X_test)))

--- Logistic Regression ---
              precision    recall  f1-score   support

           1       0.62      0.63      0.62        46
           2       0.60      0.52      0.56        52
           3       0.75      0.84      0.79        50

    accuracy                           0.66       148
   macro avg       0.66      0.66      0.66       148
weighted avg       0.66      0.66      0.66       148
# RBF-kernel SVM in the same scale-then-classify pipeline; probability=True
# enables predict_proba for the AUC comparison below.
svm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', SVC(kernel='rbf', probability=True)),
])
svm.fit(X_train, y_train)
print("\n--- SVM (RBF kernel) ---")
print(classification_report(y_test, svm.predict(X_test)))

--- SVM (RBF kernel) ---
              precision    recall  f1-score   support

           1       0.61      0.72      0.66        46
           2       0.64      0.48      0.55        52
           3       0.76      0.84      0.80        50

    accuracy                           0.68       148
   macro avg       0.67      0.68      0.67       148
weighted avg       0.67      0.68      0.67       148
# Macro one-vs-rest AUC for both fitted models.
# Fix: the print statement hard-coded "log_reg", so both output lines were
# labelled as logistic regression; use the loop's model name instead.
for name, model in [("LR", log_reg), ("SVM", svm)]:
    y_proba = model.predict_proba(X_test)
    auc = roc_auc_score(pd.get_dummies(y_test), y_proba, multi_class='ovr')
    print(f"{name} macro AUC: {auc:.3f}")
log_reg macro AUC: 0.849
log_reg macro AUC: 0.843
  • AUC 별 차이 없음
# Prepare feature/label tensors for the PyTorch MLP.
biomarkers = ["age", "creatinine", "LYVE1", "REG1B", "TFF1", "REG1A", "plasma_CA19_9"]
X = df[biomarkers].copy()
X["sex"] = df["sex"].map({"M":0, "F":1})
y = df["diagnosis"] - 1   # shift labels to 0,1,2 for CrossEntropyLoss

# Fix: split BEFORE imputing and scaling so the medians and scaling
# statistics are estimated from training rows only — the original fit both
# on the full dataset, leaking test-set information.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.long)
  • data loader
class BiomarkerDataset(Dataset):
    """Minimal Dataset over in-memory feature/label tensors."""

    def __init__(self, X, y):
        # Keep direct references; no copying needed for in-memory tensors.
        self.X = X
        self.y = y

    def __len__(self):
        # Number of rows in the feature tensor.
        return self.X.shape[0]

    def __getitem__(self, idx):
        # One (features, label) pair per sample index.
        return self.X[idx], self.y[idx]
# Wrap the tensors and build mini-batch loaders; only the training set is
# shuffled, so evaluation order stays deterministic.
train_dataset = BiomarkerDataset(X_train, y_train)
test_dataset = BiomarkerDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
class MLPClassifier(nn.Module):
    """Feed-forward classifier: Linear -> ReLU -> Dropout per hidden layer,
    then a final Linear producing raw class logits (no softmax).

    Fixes vs. the original: the body hard-coded exactly three hidden layers,
    so a `hidden_dims` of any other length was mis-handled; layers are now
    built from the sequence. The mutable default list is replaced by a tuple
    (same default architecture, so existing callers are unaffected).
    """

    def __init__(self, input_dim, hidden_dims=(128, 64, 32), output_dim=3, dropout=0.4):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, output_dim))
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        # Returns logits; apply argmax/softmax at the call site.
        return self.layers(x)
# Instantiate the model and move it to GPU when one is available.
input_dim = X_train.shape[1]
model = MLPClassifier(input_dim=input_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Cross-entropy on raw logits; Adam with a 1e-3 learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 100

# NOTE(review): the test set doubles as the validation set here, so the
# "Val Loss" curve is not an unbiased generalisation estimate — consider a
# separate validation split if this loss guides any tuning decisions.
train_losses, test_losses = [], []
for epoch in range(epochs):
    # --- training pass (dropout active) ---
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean per-batch training loss for this epoch.
    train_losses.append(total_loss / len(train_loader))

    # --- evaluation pass (dropout disabled, no gradients) ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = model(X_batch)
            val_loss += criterion(output, y_batch).item()
    test_losses.append(val_loss / len(test_loader))

    # Log every 10th epoch.
    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{epochs}] Train Loss: {train_losses[-1]:.4f}  Val Loss: {test_losses[-1]:.4f}")
Epoch [0/100] Train Loss: 1.0805  Val Loss: 1.0263
Epoch [10/100] Train Loss: 0.7961  Val Loss: 0.7018
Epoch [20/100] Train Loss: 0.7519  Val Loss: 0.6944
Epoch [30/100] Train Loss: 0.6993  Val Loss: 0.6584
Epoch [40/100] Train Loss: 0.7230  Val Loss: 0.6667
Epoch [50/100] Train Loss: 0.6858  Val Loss: 0.6535
Epoch [60/100] Train Loss: 0.6685  Val Loss: 0.6601
Epoch [70/100] Train Loss: 0.6517  Val Loss: 0.6542
Epoch [80/100] Train Loss: 0.6436  Val Loss: 0.6437
Epoch [90/100] Train Loss: 0.6599  Val Loss: 0.6618
# Plot the per-epoch training and validation loss curves side by side.
plt.plot(train_losses, label="Train Loss")
plt.plot(test_losses, label="Val Loss")
plt.title("Training Curve")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

# Switch the model to inference mode (disables dropout) before predicting.
model.eval()
MLPClassifier(
  (layers): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.4, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.4, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.4, inplace=False)
    (9): Linear(in_features=32, out_features=3, bias=True)
  )
)
# Predict on the whole test set in one pass; argmax over logits gives the
# 0/1/2 class ids, which map to Control/Benign/PDAC per the earlier shift.
with torch.no_grad():
    preds = model(X_test.to(device)).argmax(dim=1).cpu().numpy()
print(classification_report(y_test, preds, target_names=["Control","Benign","PDAC"]))
              precision    recall  f1-score   support

     Control       0.61      0.89      0.73        46
      Benign       0.75      0.35      0.47        52
        PDAC       0.81      0.92      0.86        50

    accuracy                           0.71       148
   macro avg       0.72      0.72      0.69       148
weighted avg       0.73      0.71      0.68       148
# Confusion matrix for the MLP, rendered with sklearn's display helper.
cm = confusion_matrix(y_test, preds)
disp = ConfusionMatrixDisplay(cm, display_labels=["Control","Benign","PDAC"])
disp.plot(cmap="Blues")
plt.show()